Prerequisites

In [1]:
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (install gensim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visualisation)
# conda install -c conda-forge pyldavis  # (if you use Anaconda to install pyLDAvis)
In [2]:
import pandas as pd
import numpy as np
# Download the NLTK stop-word corpus (needs network access; it is cached
# locally after the first successful run, so a download failure here is
# harmless as long as the corpus was fetched previously).
import nltk; nltk.download('stopwords') 
from nltk.corpus import stopwords      # import stop words
stop_words = stopwords.words('english')
[nltk_data] Error loading stopwords: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>
In [3]:
import re
from pprint import pprint

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os
F:\Anaconda\lib\site-packages\sklearn\decomposition\_lda.py:28: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps

Import pdf files, data wrangling and overview

In [4]:
# Extract the directories of the PDF files.
# Use a raw string for the Windows path: in a normal string literal,
# sequences such as "\9" or "\D" are (deprecated) escape sequences and can
# silently corrupt the path in future Python versions.
pdf_dir = r"D:\LEON\Business Analytics\Study\9. Business Project\Data set\Crossrail"
# sort for a deterministic, OS-independent document order (glob order is
# otherwise filesystem-dependent, which would shuffle document_ids)
pdf_files = sorted(glob.glob("%s/*.pdf" % pdf_dir))
pdf_files[3]
Out[4]:
'D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Crossrail\\British_Railways_(London)_Bill_Lords_(By_Order)_20_Jan_1988.pdf'
In [5]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            text+= page.getText()
        info['Content'] = text
        
    list_metadata.append(info)
mupdf: cmsOpenProfileFromMem failed
In [6]:
# One row per PDF; document_id is simply the positional index so the
# paragraph-level tables built later can be joined back to documents.
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index
df.head(3)
Out[6]:
format title author subject keywords creator producer creationDate modDate trapped encryption file_name Content document_id
0 PDF 1.7 Katherine A Bloomfield Microsoft® Word 2019 Microsoft® Word 2019 D:20201012160043+01'00' D:20201012160043+01'00' None 10-Year_Transport_Plan_02_Jul_2002.pdf 10-Year Transport Plan \n \n6. \n \nMr. Geoffr... 0
1 PDF 1.4 pdftk 2.02 - www.pdftk.com itext-paulo-155 (itextpdf.sf.net-lowagie.com) D:20190730052236Z D:20190730052236Z None 10_year_plan_for_transport_21_May_2002.pdf House of Commons \nTransport, Local Government... 1
2 PDF 1.4 Microsoft Word - 021588 stellent PScript5.dll Version 5.2 Acrobat Distiller 5.0.5 (Windows) D:20041008151844+01'00' D:20041008151844+01'00' None A_New_deal_for_Transport_Better_for_everyone_W... AnNew deal for Transport: Better for everyone ... 2
In [7]:
# Keep one row per unique text body and discard rows with no extracted text
df = df.drop_duplicates(subset='Content').dropna(subset=['Content'])

# Approximate word count per document: number of spaces + 1
df['Word_count'] = df['Content'].str.count(' ') + 1
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 229 entries, 0 to 228
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        229 non-null    object
 1   title         229 non-null    object
 2   author        229 non-null    object
 3   subject       229 non-null    object
 4   keywords      229 non-null    object
 5   creator       229 non-null    object
 6   producer      229 non-null    object
 7   creationDate  229 non-null    object
 8   modDate       229 non-null    object
 9   trapped       229 non-null    object
 10  encryption    5 non-null      object
 11  file_name     229 non-null    object
 12  Content       229 non-null    object
 13  document_id   229 non-null    int64 
 14  Word_count    229 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 28.6+ KB

Word count

In [8]:
# Word count
# Total (approximate, space-separated) number of words across the corpus
df['Word_count'].sum( )
Out[8]:
11132849
In [10]:
# Word count distribution
#import seaborn as sns
#ax1 = sns.distplot(df['Word_count'])
#ax1.set(title = 'Word Count Distribution',
#       xlabel = 'Word Count of Each Document');

Tokenization

In [11]:
# Collect the cleaned document texts into a plain Python list for tokenisation
data = df.Content.values.tolist()
In [12]:
def sent_to_words(sentences):
    """Yield a lower-cased token list for each input document.

    gensim.utils.simple_preprocess tokenises, lower-cases and drops very
    short/long tokens; deacc=True additionally strips accents/punctuation.
    The .encode('utf-8') is redundant (gensim decodes bytes back to str)
    but does not change the result.
    """
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence).encode('utf-8'), deacc=True))  # deacc=True removes punctuations

data_words= list(sent_to_words(data))

Processing words:

Remove stopwords, make bigrams and trigrams, lemmatise, and remove short or meaningless words

In [13]:
# Build the bigram and trigram models
# Phrases detects frequent multi-word expressions; min_count/threshold
# control how aggressively adjacent tokens are merged into one token.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser is a frozen, lightweight wrapper for fast phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    # Re-tokenise each document and drop English stop words (NLTK list)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    # Merge detected bigram collocations (e.g. "network rail" -> "network_rail")
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    # Apply the bigram model first, then the trigram model on top of it
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise each tokenised document with spaCy, keeping only tokens
    whose part-of-speech tag is in allowed_postags.
    See https://spacy.io/api/annotation for the tag set.
    """
    lemmatised_docs = []
    for tokens in texts:
        spacy_doc = nlp(" ".join(tokens))
        kept = [tok.lemma_ for tok in spacy_doc if tok.pos_ in allowed_postags]
        lemmatised_docs.append(kept)
    return lemmatised_docs
In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#increase the maximum length of text that the parser or NER can process
# (13M characters comfortably covers the largest document here)
nlp.max_length = 13000000 #

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized1[:1])

# Set the length of word threshold for removing the words less than the threshold
minimum_len = 4 # keep only lemmas of at least this many characters
data_lemmatized = [
    [token for token in doc_tokens if len(token) >= minimum_len]
    for doc_tokens in data_lemmatized1
]
print(data_lemmatized[:1])
[['year', 'transport', 'plan', 'cotswold', 'make', 'statement', 'recommendation', 'recent', 'report', 'transport', 'committee', 'relate', 'investment', 'transport', 'state', 'transport', 'committee', 'report', 'normal', 'way', 'addition', 'say', 'appoint', 'job', 'look', 'department', 'policy', 'keep', 'house', 'informed', 'policy', 'development', 'appropriate', 'committee', 'stingingly', 'critical', 'report', 'government', 'year', 'remain', 'concerned', 'lack', 'clarity', 'surround', 'financing', 'rail', 'lack', 'detailed', 'implementation', 'plan', 'major', 'barrier', 'improve', 'railway', 'expect', 'government', 'interim', 'target', 'define', 'work', 'programme', 'indeed', 'government', 'respond', 'report', 'revise', 'plan', 'produce', 'matter', 'smuggle', 'summer_recess', 'parliament', 'sit', 'report', 'table', 'normal', 'way', 'house', 'sit', 'time', 'particularly', 'conservative', 'government', 'funding', 'railway', 'indeed', 'everything_else', 'arrange', 'annual', 'basis', 'government', 'first', 'time', 'make', 'clear', 'money', 'available', 'year', 'public', 'source', 'select', 'committee', 'welcome', 'government', 'put', 'place', 'year', 'plan', 'major', 'step', 'forward', 'transport', 'planning', 'give', 'degree', 'certainty', 'industry', 'never', 'past', 'specific', 'problem', 'raise', 'select', 'committee', 'refer', 'take', 'administration', 'network', 'rail', 'put', 'place', 'possible', 'far', 'clear', 'individual', 'project', 'deliver', 'improvement', 'year', 'year', 'however', 'come', 'back', 'point', 'make', 'may', 'difference', 'tory', 'put', 'place', 'money', 'investment', 'never', 'available', 'conservative', 'government', 'railway', 'much', 'difficulty', 'suffer', 'year', 'underinvestment', 'right', 'commitment', 'railway', 'investment', 'year', 'plan', 'support', 'scheme', 'crossrail', 'line', 'may', 'draw_attention', 'wide', 'area', 'also', 'need', 'increase', 'capacity', 'railway', 'upgrade', 'kent', 'line', 'particular', 'would', 
'significant', 'improvement', 'add', 'capacity', 'increase', 'comfort', 'encourage', 'people', 'use', 'railway', 'darle', 'aware', 'problem', 'refer', 'know', 'substantial', 'investment', 'programme', 'introduce', 'new', 'railway', 'carriage', 'also', 'aware', 'problem', 'number', 'railway', 'line', 'work', 'address', 'come', 'back', 'point', 'problem', 'encounter', 'railway', 'could', 'foreseen', 'year', 'ago', 'pay', 'high', 'price', 'successive_government', 'underinvestment', 'share', 'hon_friend', 'frustration', 'crossrail', 'rather', 'inappropriately', 'name', 'thameslink', 'consider', 'doubt', 'demand', 'transport', 'substantial', 'likely', 'far', 'great', 'expansion', 'take', 'place', 'particularly', 'anxious', 'ensure', 'money', 'workable', 'plan', 'place', 'sadly', 'say', 'maidenhead', 'always', 'listen', 'say', 'repeat', 'railtrack', 'legacy', 'appalling', 'lack', 'planning', 'need', 'address', 'government', 'high', 'hope', 'rural', 'bus', 'service', 'however', 'secretary', 'state', 'know', 'full', 'well', 'transport', 'mention', 'early', 'local', 'council', 'prevent', 'enter', 'local', 'arrangement', 'bus', 'company', 'partner', 'age', 'concern', 'provide', 'tailor', 'make', 'demand', 'lead', 'service', 'needy', 'people', 'rural', 'area', 'secretary', 'state', 'come', 'forward', 'proposal', 'remove', 'unnecessary', 'regulation', 'council', 'ham', 'provide', 'quality', 'bus', 'service', 'rural', 'people', 'really', 'need', 'darle', 'listening', 'exchange', 'early', 'right', 'member', 'right', 'minister', 'transport', 'bring', 'back', 'memory', 'last', 'time', 'really', 'deal', 'bus', 'people', 'pick', 'piece', 'bus', 'deregulation', 'introduce', 'last', 'conservative', 'government', 'sympathetic', 'call', 'member', 'side', 'house', 'great', 'use', 'bus', 'believe', 'awful_lot', 'bus', 'service', 'can', 'do', 'train', 'car', 'want', 'encourage', 'great', 'use', 'aware', 'council', 'do', 'lot', 'improve', 'bus', 'service', 'rural', 'well', 'urban', 
'council', 'area', 'issue', 'want', 'look', 'next', 'month', 'see', 'improve', 'level', 'bus', 'provision', 'rural', 'area', 'well', 'serve', 'present', 'system', 'believe', 'bus', 'transport', 'could', 'enable', 'people', 'move', 'around', 'present', 'exploit', 'enough', 'agree', 'central', 'railway', 'scheme', 'could', 'make', 'enormous', 'contribution', 'rail', 'freight', 'provide', 'vital', 'economic', 'link', 'north', 'midland', 'make', 'great', 'contribution', 'achieve', 'government', 'year', 'transport', 'plan', 'know', 'want', 'freight', 'user', 'use', 'rail', 'service', 'authority', 'currently', 'consider', 'provide', 'advice', 'report', 'later', 'year', 'keep', 'inform', 'effect', 'condemn', 'plan', 'confidence', 'trick', 'previous', 'secretary', 'state', 'promise', 'would', 'revise', 'plan', 'revise', 'plan', 'publish', 'furthermore', 'secretary', 'state', 'confidence', 'crewe', 'nantwich_mrs_dunwoody', 'extend', 'confidence', 'endorse', 'continued', 'membership', 'rmt', 'darle', 'happily', 'last', 'point', 'lie', 'responsibility', 'suspect', 'member', 'crewe', 'share', 'exactly', 'view', 'may', 'describe', 'substantial', 'point', 'year', 'plan', 'make', 'clear', 'early', 'reply', 'want', 'look', 'aspect', 'department', 'policy', 'currently', 'envisage', 'make', 'revisal', 'year', 'year', 'plan', 'set', 'investment', 'strategy', 'bring', 'much', 'benefit', 'board', 'shall', 'however', 'respond', 'committee', 'say', 'next', 'month', 'shall', 'keep', 'house', 'informed', 'improvement', 'development', 'necessary', 'relation', 'plan', 'year', 'plan', 'investment', 'strategy', 'strategy', 'must', 'capable', 'development', 'improvement', 'intend', 'next', 'come', 'hope', 'say', 'prepared', 'match', 'investment', 'make', 'tory', 'still', 'commit', 'cut', 'suggest', 'far']]
[['year', 'transport', 'plan', 'cotswold', 'make', 'statement', 'recommendation', 'recent', 'report', 'transport', 'committee', 'relate', 'investment', 'transport', 'state', 'transport', 'committee', 'report', 'normal', 'addition', 'appoint', 'look', 'department', 'policy', 'keep', 'house', 'informed', 'policy', 'development', 'appropriate', 'committee', 'stingingly', 'critical', 'report', 'government', 'year', 'remain', 'concerned', 'lack', 'clarity', 'surround', 'financing', 'rail', 'lack', 'detailed', 'implementation', 'plan', 'major', 'barrier', 'improve', 'railway', 'expect', 'government', 'interim', 'target', 'define', 'work', 'programme', 'indeed', 'government', 'respond', 'report', 'revise', 'plan', 'produce', 'matter', 'smuggle', 'summer_recess', 'parliament', 'report', 'table', 'normal', 'house', 'time', 'particularly', 'conservative', 'government', 'funding', 'railway', 'indeed', 'everything_else', 'arrange', 'annual', 'basis', 'government', 'first', 'time', 'make', 'clear', 'money', 'available', 'year', 'public', 'source', 'select', 'committee', 'welcome', 'government', 'place', 'year', 'plan', 'major', 'step', 'forward', 'transport', 'planning', 'give', 'degree', 'certainty', 'industry', 'never', 'past', 'specific', 'problem', 'raise', 'select', 'committee', 'refer', 'take', 'administration', 'network', 'rail', 'place', 'possible', 'clear', 'individual', 'project', 'deliver', 'improvement', 'year', 'year', 'however', 'come', 'back', 'point', 'make', 'difference', 'tory', 'place', 'money', 'investment', 'never', 'available', 'conservative', 'government', 'railway', 'much', 'difficulty', 'suffer', 'year', 'underinvestment', 'right', 'commitment', 'railway', 'investment', 'year', 'plan', 'support', 'scheme', 'crossrail', 'line', 'draw_attention', 'wide', 'area', 'also', 'need', 'increase', 'capacity', 'railway', 'upgrade', 'kent', 'line', 'particular', 'would', 'significant', 'improvement', 'capacity', 'increase', 'comfort', 'encourage', 'people', 
'railway', 'darle', 'aware', 'problem', 'refer', 'know', 'substantial', 'investment', 'programme', 'introduce', 'railway', 'carriage', 'also', 'aware', 'problem', 'number', 'railway', 'line', 'work', 'address', 'come', 'back', 'point', 'problem', 'encounter', 'railway', 'could', 'foreseen', 'year', 'high', 'price', 'successive_government', 'underinvestment', 'share', 'hon_friend', 'frustration', 'crossrail', 'rather', 'inappropriately', 'name', 'thameslink', 'consider', 'doubt', 'demand', 'transport', 'substantial', 'likely', 'great', 'expansion', 'take', 'place', 'particularly', 'anxious', 'ensure', 'money', 'workable', 'plan', 'place', 'sadly', 'maidenhead', 'always', 'listen', 'repeat', 'railtrack', 'legacy', 'appalling', 'lack', 'planning', 'need', 'address', 'government', 'high', 'hope', 'rural', 'service', 'however', 'secretary', 'state', 'know', 'full', 'well', 'transport', 'mention', 'early', 'local', 'council', 'prevent', 'enter', 'local', 'arrangement', 'company', 'partner', 'concern', 'provide', 'tailor', 'make', 'demand', 'lead', 'service', 'needy', 'people', 'rural', 'area', 'secretary', 'state', 'come', 'forward', 'proposal', 'remove', 'unnecessary', 'regulation', 'council', 'provide', 'quality', 'service', 'rural', 'people', 'really', 'need', 'darle', 'listening', 'exchange', 'early', 'right', 'member', 'right', 'minister', 'transport', 'bring', 'back', 'memory', 'last', 'time', 'really', 'deal', 'people', 'pick', 'piece', 'deregulation', 'introduce', 'last', 'conservative', 'government', 'sympathetic', 'call', 'member', 'side', 'house', 'great', 'believe', 'awful_lot', 'service', 'train', 'want', 'encourage', 'great', 'aware', 'council', 'improve', 'service', 'rural', 'well', 'urban', 'council', 'area', 'issue', 'want', 'look', 'next', 'month', 'improve', 'level', 'provision', 'rural', 'area', 'well', 'serve', 'present', 'system', 'believe', 'transport', 'could', 'enable', 'people', 'move', 'around', 'present', 'exploit', 'enough', 'agree', 
'central', 'railway', 'scheme', 'could', 'make', 'enormous', 'contribution', 'rail', 'freight', 'provide', 'vital', 'economic', 'link', 'north', 'midland', 'make', 'great', 'contribution', 'achieve', 'government', 'year', 'transport', 'plan', 'know', 'want', 'freight', 'user', 'rail', 'service', 'authority', 'currently', 'consider', 'provide', 'advice', 'report', 'later', 'year', 'keep', 'inform', 'effect', 'condemn', 'plan', 'confidence', 'trick', 'previous', 'secretary', 'state', 'promise', 'would', 'revise', 'plan', 'revise', 'plan', 'publish', 'furthermore', 'secretary', 'state', 'confidence', 'crewe', 'nantwich_mrs_dunwoody', 'extend', 'confidence', 'endorse', 'continued', 'membership', 'darle', 'happily', 'last', 'point', 'responsibility', 'suspect', 'member', 'crewe', 'share', 'exactly', 'view', 'describe', 'substantial', 'point', 'year', 'plan', 'make', 'clear', 'early', 'reply', 'want', 'look', 'aspect', 'department', 'policy', 'currently', 'envisage', 'make', 'revisal', 'year', 'year', 'plan', 'investment', 'strategy', 'bring', 'much', 'benefit', 'board', 'shall', 'however', 'respond', 'committee', 'next', 'month', 'shall', 'keep', 'house', 'informed', 'improvement', 'development', 'necessary', 'relation', 'plan', 'year', 'plan', 'investment', 'strategy', 'strategy', 'must', 'capable', 'development', 'improvement', 'intend', 'next', 'come', 'hope', 'prepared', 'match', 'investment', 'make', 'tory', 'still', 'commit', 'suggest']]

Create the Dictionary and Corpus needed for Topic Modeling

In [16]:
# Create Dictionary
# Maps every unique token to an integer id used by the bag-of-words corpus
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
# Each document becomes a sparse list of (token_id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 4), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 3), (21, 1), (22, 3), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 3), (37, 4), (38, 1), (39, 1), (40, 1), (41, 6), (42, 1), (43, 1), (44, 1), (45, 1), (46, 3), (47, 3), (48, 2), (49, 1), (50, 2), (51, 1), (52, 3), (53, 4), (54, 2), (55, 1), (56, 2), (57, 2), (58, 3), (59, 1), (60, 1), (61, 1), (62, 1), (63, 2), (64, 2), (65, 1), (66, 1), (67, 1), (68, 3), (69, 1), (70, 1), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 2), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 10), (103, 4), (104, 1), (105, 2), (106, 1), (107, 2), (108, 4), (109, 3), (110, 1), (111, 3), (112, 4), (113, 1), (114, 2), (115, 2), (116, 1), (117, 1), (118, 1), (119, 2), (120, 1), (121, 1), (122, 2), (123, 7), (124, 1), (125, 3), (126, 1), (127, 3), (128, 3), (129, 3), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 3), (136, 1), (137, 1), (138, 1), (139, 2), (140, 3), (141, 1), (142, 2), (143, 9), (144, 1), (145, 1), (146, 3), (147, 1), (148, 1), (149, 1), (150, 1), (151, 1), (152, 3), (153, 2), (154, 1), (155, 2), (156, 1), (157, 1), (158, 1), (159, 1), (160, 3), (161, 1), (162, 1), (163, 2), (164, 3), (165, 2), (166, 1), (167, 1), (168, 1), (169, 1), (170, 2), (171, 1), (172, 1), (173, 5), (174, 1), (175, 1), (176, 5), (177, 14), (178, 2), (179, 4), (180, 3), (181, 1), (182, 1), (183, 2), (184, 1), (185, 1), (186, 1), (187, 4), (188, 1), (189, 2), (190, 1), (191, 1), (192, 1), (193, 4), (194, 1), (195, 1), (196, 1), (197, 1), (198, 4), (199, 1), (200, 10), (201, 1), (202, 1), (203, 2), (204, 1), (205, 1), (206, 2), (207, 1), (208, 1), (209, 1), 
(210, 1), (211, 1), (212, 1), (213, 1), (214, 6), (215, 2), (216, 1), (217, 1), (218, 3), (219, 3), (220, 5), (221, 1), (222, 2), (223, 4), (224, 2), (225, 1), (226, 6), (227, 2), (228, 2), (229, 1), (230, 1), (231, 1), (232, 1), (233, 1), (234, 5), (235, 1), (236, 1), (237, 1), (238, 1), (239, 3), (240, 3), (241, 1), (242, 1), (243, 1), (244, 1), (245, 1), (246, 1), (247, 1), (248, 1), (249, 1), (250, 1), (251, 1), (252, 2), (253, 1), (254, 1), (255, 3), (256, 2), (257, 1), (258, 10), (259, 1), (260, 2), (261, 1), (262, 1), (263, 1), (264, 1), (265, 1), (266, 1), (267, 4), (268, 1), (269, 3), (270, 1), (271, 2), (272, 1), (273, 2), (274, 15)]]

Building the LDA model, hyperparameter (k) tuning

In [17]:
# set training parameters
k = 20                   # number of topics (retuned below)
passes = 20              # full sweeps over the corpus during training
iterations = 100         # per-document inner-loop iterations
alpha = 50.0/k           # symmetric document-topic prior (common 50/k heuristic)
eta = 0.01               # topic-word prior
random_state = 12345     # fixed seed for reproducible training
minimum_probability = 0  # report every topic probability, however small
In [36]:
# create the function for computing the coherence score of different models with different number of topics.
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary used as the model vocabulary and for coherence.
    corpus : bag-of-words corpus (list of lists of (token_id, count) pairs).
    texts : tokenised documents, required by the 'c_v' coherence measure.
    limit, start, step : range(start, limit, step) of topic counts to try.

    Returns
    -------
    (model_list, coherence_values) : the trained models and their c_v scores.

    Note: alpha, eta, iterations, passes, random_state and minimum_probability
    are read from the notebook-level configuration cell.
    """
    coherence_values = []
    model_list = []
    for k in range(start, limit, step):
        # Bug fix: the original used the global `id2word` here instead of the
        # `dictionary` argument, silently ignoring the parameter.
        model = gensim.models.LdaModel(num_topics=k, corpus=corpus, id2word=dictionary,
                                       alpha=alpha, eta=eta,
                                       iterations=iterations, passes=passes,
                                       random_state=random_state,
                                       minimum_probability=minimum_probability)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
In [ ]:
# apply the function, it might take a long time.
#limit=80; start=0; step=5;
#model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)
In [ ]:
# plot the coherence score against number of topics
#x = range(start, limit, step)
#list_num_topics = [i for i in x]
#df_coherence = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y="Coherence_Score", title = 'Coherence score against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()
In [28]:
# k = 15 topics, chosen from the coherence-score tuning above
# (NOTE(review): an earlier comment said 35 — confirm which run chose 15)
k = 15
lda_model = gensim.models.LdaModel(
    corpus = corpus,
    id2word = id2word,
    alpha = alpha,                 # document-topic prior from the config cell
    eta = eta,                     # topic-word prior
    iterations = iterations,
    num_topics = k,
    passes = passes,
    random_state = 12345,          # fixed seed for reproducibility
    minimum_probability = minimum_probability)

Classify the paragraphs based on the trained model

Extract paragraphs from documents

In [29]:
# create the function for extraction of paragraphs by splitting the documents by new lines
def para_split(i):
    """Split a document's raw text into paragraphs.

    Tika output marks paragraph breaks either with a blank line containing a
    single space ('\\n \\n') or with an empty blank line ('\\n\\n'); the former
    is preferred when present. Returns [] for None input, since tika's parser
    can return None content for files it fails to extract.
    """
    if i is None:
        return []
    if '\n \n' in i:
        return i.split('\n \n')
    return i.split('\n\n')
In [30]:
list_paragraphs = []
list_para_id = []
for i in pdf_files:
    # tika returns a dict; 'content' holds the full extracted text.
    # NOTE(review): tika can return None content for unparseable PDFs,
    # which would make para_split raise — confirm all files extract cleanly.
    j = parser.from_file(i)
    m = j['content']
    para = para_split(m)
    para = [w.replace('\n', '') for w in para]
    para = [x.strip() for x in para if x.strip()] # remove empty elements
    para_id = [x for x in range(len(para))]  # positional id of each paragraph within its document
    list_paragraphs.append(para)
    list_para_id.append(para_id)
2021-08-21 22:01:18,618 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2021-08-21 22:01:18,618 : WARNING : Failed to see startup log message; retrying...
In [69]:
# One row per paragraph: attach the per-document paragraph lists, then
# explode so each (document, paragraph) pair becomes its own row.
df_para1 = df.copy()
# Assumes list_paragraphs / list_para_id align positionally with df's rows
# (both were built by iterating pdf_files in the same order) — TODO confirm
# the earlier drop_duplicates/dropna did not break this alignment.
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
# explode every column; scalar columns are broadcast across the new rows
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
df_para4
Out[69]:
creationDate document_id file_name para_id paragraphs
0 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 0 10-Year Transport Plan
1 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 1 6.
2 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 2 Mr. Geoffrey Clifton-Brown (Cotswold) If he w...
3 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 3 The Secretary of State for Transport (Mr. Alis...
4 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 4 Mr. Clifton-Brown Paragraph 84 of the Select C...
... ... ... ... ... ...
148647 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 7 Mr. Spellar That was rather ungallant of Oppos...
148648 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 8 Chris Grayling (Epsom and Ewell) The Governme...
148649 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 9 Mr. Spellar That is fine cheek, coming from a ...
148650 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 10 Andrew Bennett (Denton and Reddish) Does my r...
148651 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 11 Mr. Spellar I will draw my hon. Friend's comme...

148652 rows × 5 columns

I applied the 148,652 paragraphs above to classify their topics, but it was quite time-consuming and I had not got the result after 2 hours. So, as shown below, I provide users a threshold n for selecting only the paragraphs with more than n words for classification. Here I set n = 30, and the number of paragraphs to be classified decreased to 52,697; it takes about half an hour to get the result.

In [70]:
n_word_count = 30  # keep only paragraphs with at least this many whitespace-separated words
# word count of each paragraph (split on whitespace, count the tokens)
word_counts = df_para4['paragraphs'].str.split().str.len()
# select only the paragraphs whose word count meets the threshold
df_para = df_para4.loc[word_counts >= n_word_count].reset_index()
df_para
Out[70]:
index creationDate document_id file_name para_id paragraphs
0 2 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 2 Mr. Geoffrey Clifton-Brown (Cotswold) If he w...
1 3 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 3 The Secretary of State for Transport (Mr. Alis...
2 4 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 4 Mr. Clifton-Brown Paragraph 84 of the Select C...
3 5 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 5 Mr. Darling The report will be tabled in the n...
4 6 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 6 Clive Efford (Eltham) I welcome my right hon....
... ... ... ... ... ... ...
52692 148647 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 7 Mr. Spellar That was rather ungallant of Oppos...
52693 148648 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 8 Chris Grayling (Epsom and Ewell) The Governme...
52694 148649 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 9 Mr. Spellar That is fine cheek, coming from a ...
52695 148650 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 10 Andrew Bennett (Denton and Reddish) Does my r...
52696 148651 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 11 Mr. Spellar I will draw my hon. Friend's comme...

52697 rows × 6 columns

Process the paragraphs

In [49]:
# tokenization
# Reuse the document-level tokeniser on the individual paragraphs
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))
In [50]:
# Remove Stop Words
data_words_nostops2 = remove_stopwords(data_words2)

# Form Trigrams
# (the phrase models trained on the full documents are reused here)
data_words_trigrams2 = make_trigrams(data_words_nostops2)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
In [51]:
# set the length of word threshold for removing the words less than the threshold
minimum_len = 4  # keep only lemmas of at least this many characters
data_lemmatized2_1 = [
    [token for token in paragraph_tokens if len(token) >= minimum_len]
    for paragraph_tokens in data_lemmatized2
]

Classify topics of paragraphs

In [52]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    """Return a dict built from an iterable of (key, value) tuples.

    The `di` argument is kept only for backward compatibility with existing
    callers and is never read: in the original implementation, rebinding the
    local name `di` could not affect the caller's dictionary anyway.
    """
    return dict(tup)
In [53]:
# belong function: classify topics of paragraphs; may take a long time because
# there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)                          # bag-of-words for this paragraph
    belong = lda_model[bow]                           # list of (topic_id, probability) tuples
    doc_dic = Convert(belong, dictionary_topic_para)  # tuples -> dict
    list_topic_para.append(doc_dic)

# Build the DataFrame ONCE, after the loop. The original rebuilt it on every
# iteration, which is quadratic in the number of paragraphs (~52,697 rebuilds).
df_topic_para = pd.DataFrame(list_topic_para)
In [71]:
# Topic distribution across paragraphs: attach the per-topic probability
# columns to the paragraph metadata by row index (left join keeps every paragraph).
df_topic_para1_1 = df_para.merge(df_topic_para, how = 'left', left_index=True, right_index=True)
df_topic_para1_1
Out[71]:
index creationDate document_id file_name para_id paragraphs 0 1 2 3 ... 5 6 7 8 9 10 11 12 13 14
0 2 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 2 Mr. Geoffrey Clifton-Brown (Cotswold) If he w... 0.072448 0.062922 0.056170 0.059287 ... 0.075801 0.064500 0.060214 0.068891 0.064993 0.055450 0.077713 0.076425 0.068982 0.070236
1 3 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 3 The Secretary of State for Transport (Mr. Alis... 0.070747 0.070882 0.050330 0.061963 ... 0.062193 0.065695 0.061436 0.068397 0.062524 0.049954 0.093780 0.076701 0.064980 0.063693
2 4 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 4 Mr. Clifton-Brown Paragraph 84 of the Select C... 0.086332 0.063053 0.037152 0.042598 ... 0.065017 0.061546 0.050136 0.055723 0.066761 0.037708 0.116469 0.081088 0.071017 0.066188
3 5 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 5 Mr. Darling The report will be tabled in the n... 0.131403 0.068612 0.025374 0.029843 ... 0.044684 0.044783 0.040568 0.055095 0.052669 0.024974 0.061665 0.052602 0.235253 0.040666
4 6 D:20201012160043+01'00' 0 10-Year_Transport_Plan_02_Jul_2002.pdf 6 Clive Efford (Eltham) I welcome my right hon.... 0.069285 0.069053 0.044518 0.049888 ... 0.078048 0.061086 0.068690 0.062046 0.102345 0.043165 0.064314 0.079124 0.093914 0.056291
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52692 148647 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 7 Mr. Spellar That was rather ungallant of Oppos... 0.096125 0.071386 0.027858 0.041813 ... 0.052810 0.040474 0.051293 0.056721 0.133052 0.026874 0.047339 0.051183 0.162787 0.045006
52693 148648 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 8 Chris Grayling (Epsom and Ewell) The Governme... 0.085474 0.068257 0.047044 0.056173 ... 0.066754 0.061583 0.050914 0.060732 0.076330 0.047374 0.057462 0.060376 0.128952 0.061780
52694 148649 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 9 Mr. Spellar That is fine cheek, coming from a ... 0.073769 0.074035 0.047510 0.057346 ... 0.059344 0.056209 0.057094 0.060466 0.061084 0.047341 0.054888 0.058834 0.131114 0.070625
52695 148650 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 10 Andrew Bennett (Denton and Reddish) Does my r... 0.083192 0.059174 0.047981 0.057561 ... 0.068059 0.071295 0.062680 0.064459 0.084050 0.046833 0.059579 0.062758 0.117769 0.049012
52696 148651 D:20201012160658+01'00' 228 West_Coast_Main_Line_05_Mar_2002.pdf 11 Mr. Spellar I will draw my hon. Friend's comme... 0.065911 0.068391 0.044016 0.043735 ... 0.087748 0.063180 0.086749 0.068249 0.082650 0.043095 0.060870 0.063127 0.098716 0.053991

52697 rows × 21 columns

In [145]:
# Save the intermediate result to disk so the expensive classification loop
# above need not be re-run on a fresh kernel.
df_topic_para1_1.to_pickle('./df_topic_para1.pkl')
In [146]:
# Load the result from disk (pickle is acceptable here because we wrote the
# file ourselves — never unpickle untrusted files).
df_topic_para1 = pd.read_pickle('./df_topic_para1.pkl') 

Highest N ranked paragraphs overall

In [212]:
df_topic_para1_n = df_topic_para1.copy()
# Topic-probability columns start after the 6 metadata columns
# (index, creationDate, document_id, file_name, para_id, paragraphs).
topic_probs = df_topic_para1_n.iloc[:, 6:]
df_topic_para1_n['highest_p'] = topic_probs.max(axis = 1)          # highest probability per paragraph
df_topic_para1_n['salient_topic'] = topic_probs.idxmax(axis = 1)   # topic id attaining that probability
df_topic_para1_n = df_topic_para1_n[['file_name', 'para_id', 'paragraphs', 'salient_topic', 'highest_p']]
In [213]:
# highest 5 ranked paragraphs overall
df_topic_para1_n.nlargest(5,['highest_p'])
Out[213]:
file_name para_id paragraphs salient_topic highest_p
32358 House_of_Commons._Fifth_report_from_the_Transp... 3 TRANSPORT COMMITTEE 35 24 February 1981] [Cont... 8 0.977223
32362 House_of_Commons._Fifth_report_from_the_Transp... 7 TRANSPORT COMMITTEE 209 IQiMay 1981] [Continue... 8 0.969239
32889 House_of_Commons._Transport_Committee._Session... 1 TRANSPORT COMMITTEE 185 \1 March \9%1] [Contin... 8 0.962825
49481 Transport_Select_Committee_The_future_of_light... 0 House of Commons Transport Committee Integrate... 6 0.953018
32360 House_of_Commons._Fifth_report_from_the_Transp... 5 186 MINUTES OF EVIDENCE TAKEN BEFORE THE 13 Ma... 8 0.948732

Highest N ranked paragraphs from each topic

In [174]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n, num_topics=None):
    """Return the ``top_n`` highest-probability paragraphs for each topic.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'paragraphs' column and one probability column per
        topic, named by integer topic id (0 .. num_topics-1).
    top_n : int
        Number of top-ranked paragraphs to keep per topic.
    num_topics : int, optional
        Number of topic columns to scan. Defaults to the notebook-level
        ``k`` (kept for backward compatibility with existing calls).

    Returns
    -------
    pd.DataFrame
        One exploded row per (topic, paragraph) pair with columns
        'topic_id', 'salient_paragraph', 'probability'.
    """
    if num_topics is None:
        num_topics = k  # notebook-level topic count set when the model was built
    list_topic_id = list(range(num_topics))
    list_n_para = []
    list_n_p = []
    for topic in list_topic_id:
        # Sort once per topic (the original called nlargest twice per topic).
        top_rows = df.nlargest(top_n, [topic])
        list_n_para.append(top_rows['paragraphs'].tolist())
        list_n_p.append(top_rows[topic].tolist())
    pd_n_para = pd.DataFrame({'topic_id': list_topic_id,
                              'salient_paragraph': list_n_para,
                              'probability': list_n_p})
    # Explode the per-topic lists into one row per paragraph.
    return(pd_n_para.apply(pd.Series.explode))
In [175]:
# highest 2 ranked paragraphs from each topic
top_n_filter(df_topic_para1, 2)
Out[175]:
topic_id salient_paragraph probability
0 0 Lord Falconer of Thoroton My Lords, obviously ... 0.640784
0 0 Lord Falconer of Thoroton My Lords, it is unaf... 0.629251
1 1 somewhere—I think it is that one—is number 19 ... 0.707127
1 1 hall, the big studio there which we designed, ... 0.681743
2 2 £40 million to £60 million £20 million to £40 ... 0.066667
2 2 July 1998 July 1998 July 1998 July 1998 July 1... 0.066667
3 3 1. In the Compulsory Purchase Act 1965 (hereaf... 0.756951
3 3 Clause 12 requires notice of any street works ... 0.692393
4 4 Lord Clinton-Davis My Lords, I thank the Minis... 0.789729
4 4 Lord Brabazon of Tara My Lords, I beg to move ... 0.782108
5 5 Transport for London > Annual Report and State... 0.933402
5 5 FIGURE 4 Percentage chance of peak crowding, f... 0.798811
6 6 House of Commons Transport Committee Integrate... 0.953018
6 6 Ev 204 Transport Committee: Evidence APPENDIX ... 0.929976
7 7 18084. What you were suggesting earlier was a ... 0.554868
7 7 aoi-t House of Commons Transport Committee Ove... 0.53027
8 8 TRANSPORT COMMITTEE 35 24 February 1981] [Cont... 0.977223
8 8 TRANSPORT COMMITTEE 209 IQiMay 1981] [Continue... 0.969239
9 9 Ixx SECOND REPORT FROM FIGURE 4 STATE OF ORDER... 0.880243
9 9 THE TRANSPORT COMMITTEE XXXIX FIGURE 1 BR : CA... 0.875046
10 10 £40 million to £60 million £20 million to £40 ... 0.066667
10 10 July 1998 July 1998 July 1998 July 1998 July 1... 0.066667
11 11 Figure 7a: Road deaths and serious injuries 19... 0.90439
11 11 Appendix D Quality of data The data system for... 0.898391
12 12 Local action Local transport plans New local t... 0.928547
12 12 The New Deal for Transport will make a big dif... 0.907127
13 13 Mr. Michael Portillo (Kensington and Chelsea):... 0.829223
13 13 Mr. Clarke That might be defensible if someone... 0.792528
14 14 Indeed, experts reject the Promoter’s reasons ... 0.706924
14 14 Statement does not mean that document ceases t... 0.666361

Highest N ranked paragraphs for topic K

In [221]:
# Highest N ranked paragraphs for a chosen topic.
topic_id_chosen = 7                    # choose the topic ID
num_para = 2                           # set N (paragraphs to show per topic)
# Fix: the original hard-coded 2 in this call, so changing num_para had no effect.
df_n_topic_k = top_n_filter(df_topic_para1, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter]
Out[221]:
topic_id salient_paragraph probability
7 7 18084. What you were suggesting earlier was a ... 0.554868
7 7 aoi-t House of Commons Transport Committee Ove... 0.53027

Highest N ranked paragraphs where the belong() function is greater than the threshold for M topics at a time

In [176]:
# selecting the paragraphs where the belong() function is greater than the threshold for M topics at a time
threshold = 1/3                                                    # set threshold 
# Topic-probability columns start at position 6 — after the 6 metadata columns
# (index, creationDate, document_id, file_name, para_id, paragraphs).
# The original used iloc[:, 5:], which wrongly swept the text column
# 'paragraphs' into max(axis=1); position 6 matches the slicing used earlier.
topic_filter = df_topic_para1.iloc[:, 6:].max(axis=1) > threshold  # set filter
df_topic_para_M = df_topic_para1[topic_filter]                     # extract the qualified paragraphs
df_topic_para_M
Out[176]:
index creationDate document_id file_name para_id paragraphs 0 1 2 3 ... 5 6 7 8 9 10 11 12 13 14
74 255 D:20190730052236Z 1 10_year_plan_for_transport_21_May_2002.pdf 241 45. The Transport Act 2000 makes provisions fo... 0.044262 0.043324 0.030238 0.042949 ... 0.044425 0.077738 0.050043 0.061554 0.047844 0.028773 0.055126 0.335951 0.055171 0.043903
84 283 D:20190730052236Z 1 10_year_plan_for_transport_21_May_2002.pdf 269 51. The power to introduce local charging sche... 0.050313 0.044189 0.026413 0.029249 ... 0.037758 0.063563 0.051164 0.056167 0.045450 0.025352 0.052043 0.334488 0.105345 0.037271
85 285 D:20190730052236Z 1 10_year_plan_for_transport_21_May_2002.pdf 271 52. Hie 10 Year Plan acknowledges that “people... 0.050369 0.035537 0.023224 0.025945 ... 0.046300 0.072777 0.041990 0.058119 0.042016 0.022812 0.055307 0.362818 0.093287 0.034373
153 483 D:20190730052236Z 1 10_year_plan_for_transport_21_May_2002.pdf 469 102. Outside the Department there is overwhelm... 0.039320 0.030659 0.019356 0.022694 ... 0.041283 0.046922 0.039678 0.072998 0.030887 0.019056 0.069240 0.466428 0.044025 0.028318
155 495 D:20190730052236Z 1 10_year_plan_for_transport_21_May_2002.pdf 481 104, The Commission for Integrated Transport b... 0.052787 0.034785 0.020927 0.028427 ... 0.039780 0.051108 0.037862 0.079260 0.044644 0.020515 0.060161 0.390724 0.071802 0.032363
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52623 148543 D:20201012155409+01'00' 226 Tube_Investment_debate_in_Commons_Chamber_08_D... 347 Mr. Jenkin: The notes say that income from the... 0.157773 0.030668 0.007399 0.015877 ... 0.039327 0.032086 0.023835 0.020724 0.022524 0.007307 0.020543 0.015676 0.562022 0.018751
52624 148545 D:20201012155409+01'00' 226 Tube_Investment_debate_in_Commons_Chamber_08_D... 349 The Parliamentary Under-Secretary of State for... 0.025860 0.025818 0.009835 0.013093 ... 0.015744 0.015999 0.015915 0.019068 0.017515 0.009789 0.015963 0.017362 0.751439 0.016530
52625 148547 D:20201012155409+01'00' 226 Tube_Investment_debate_in_Commons_Chamber_08_D... 351 Mr. Hill: I am sorry. The hon. Gentleman has n... 0.025708 0.014416 0.009230 0.010668 ... 0.015945 0.015724 0.016117 0.023949 0.017505 0.009025 0.018608 0.023142 0.770463 0.012866
52637 148564 D:20201012155409+01'00' 226 Tube_Investment_debate_in_Commons_Chamber_08_D... 368 Division No. 15] [10.15 pm AYES Adams, Mrs Ire... 0.079947 0.039264 0.022127 0.028502 ... 0.031971 0.040964 0.033352 0.036987 0.036890 0.022422 0.054178 0.098775 0.408451 0.032336
52683 148628 D:20201012163723+01'00' 227 Waterloo_Station_14_Mar_2007.pdf 63 Preliminary conclusions from the study suggest... 0.042676 0.076473 0.028136 0.030269 ... 0.051965 0.057587 0.337797 0.056731 0.072070 0.027173 0.040364 0.039031 0.052323 0.047508

3095 rows × 21 columns

In [214]:
# Highest 2 ranked paragraphs where the belong() function is greater than the threshold for M topics at a time
top_n_filter(df_topic_para_M, 2)
Out[214]:
topic_id salient_paragraph probability
0 0 Lord Falconer of Thoroton My Lords, obviously ... 0.640784
0 0 Lord Falconer of Thoroton My Lords, it is unaf... 0.629251
1 1 somewhere—I think it is that one—is number 19 ... 0.707127
1 1 hall, the big studio there which we designed, ... 0.681743
2 2 (3) Subsection (2) shall have effect as if lan... 0.045091
2 2 (3) Subsection (2) shall have effect as if lan... 0.045091
3 3 1. In the Compulsory Purchase Act 1965 (hereaf... 0.756951
3 3 Clause 12 requires notice of any street works ... 0.692393
4 4 Lord Clinton-Davis My Lords, I thank the Minis... 0.789729
4 4 Lord Brabazon of Tara My Lords, I beg to move ... 0.782108
5 5 Transport for London > Annual Report and State... 0.933402
5 5 FIGURE 4 Percentage chance of peak crowding, f... 0.798811
6 6 House of Commons Transport Committee Integrate... 0.953018
6 6 Ev 204 Transport Committee: Evidence APPENDIX ... 0.929976
7 7 18084. What you were suggesting earlier was a ... 0.554868
7 7 aoi-t House of Commons Transport Committee Ove... 0.53027
8 8 TRANSPORT COMMITTEE 35 24 February 1981] [Cont... 0.977223
8 8 TRANSPORT COMMITTEE 209 IQiMay 1981] [Continue... 0.969239
9 9 Ixx SECOND REPORT FROM FIGURE 4 STATE OF ORDER... 0.880243
9 9 THE TRANSPORT COMMITTEE XXXIX FIGURE 1 BR : CA... 0.875046
10 10 (3) Subsection (2) shall have effect as if lan... 0.045092
10 10 (3) Subsection (2) shall have effect as if lan... 0.045092
11 11 Figure 7a: Road deaths and serious injuries 19... 0.90439
11 11 Appendix D Quality of data The data system for... 0.898391
12 12 Local action Local transport plans New local t... 0.928547
12 12 The New Deal for Transport will make a big dif... 0.907127
13 13 Mr. Michael Portillo (Kensington and Chelsea):... 0.829223
13 13 Mr. Clarke That might be defensible if someone... 0.792528
14 14 Indeed, experts reject the Promoter’s reasons ... 0.706924
14 14 Statement does not mean that document ceases t... 0.666361

Overview of topics

The most frequent 10 words of each topic

In [222]:
# Print the 10 most probable words (with weights) for each topic of the LDA model.
pprint(lda_model.print_topics())
[(0,
  '0.016*"would" + 0.013*"railtrack" + 0.012*"year" + 0.010*"think" + '
  '0.009*"investment" + 0.008*"take" + 0.008*"make" + 0.008*"public" + '
  '0.008*"company" + 0.007*"cost"'),
 (1,
  '0.019*"would" + 0.016*"crossrail" + 0.009*"committee" + 0.008*"think" + '
  '0.008*"work" + 0.007*"make" + 0.007*"point" + 0.007*"take" + '
  '0.007*"station" + 0.006*"area"'),
 (2,
  '0.009*"transport" + 0.008*"service" + 0.007*"year" + 0.007*"rail" + '
  '0.006*"scheme" + 0.006*"would" + 0.005*"make" + 0.005*"line" + '
  '0.005*"government" + 0.005*"take"'),
 (3,
  '0.029*"work" + 0.023*"shall" + 0.019*"land" + 0.019*"paragraph" + '
  '0.014*"provision" + 0.014*"section" + 0.014*"part" + 0.013*"schedule" + '
  '0.013*"purpose" + 0.011*"railway"'),
 (4,
  '0.023*"bill" + 0.014*"would" + 0.013*"crossrail" + 0.012*"make" + '
  '0.009*"clause" + 0.009*"project" + 0.008*"government" + 0.008*"give" + '
  '0.008*"amendment" + 0.007*"member"'),
 (5,
  '0.014*"line" + 0.012*"train" + 0.012*"year" + 0.011*"safety" + '
  '0.009*"service" + 0.009*"station" + 0.009*"work" + 0.007*"system" + '
  '0.007*"passenger" + 0.006*"project"'),
 (6,
  '0.022*"rail" + 0.020*"transport" + 0.017*"system" + 0.016*"light" + '
  '0.014*"cost" + 0.011*"scheme" + 0.010*"public" + 0.009*"would" + '
  '0.006*"service" + 0.006*"passenger"'),
 (7,
  '0.018*"train" + 0.016*"transport" + 0.015*"ticket" + 0.014*"passenger" + '
  '0.009*"travel" + 0.009*"service" + 0.009*"would" + 0.009*"people" + '
  '0.008*"fare" + 0.008*"operator"'),
 (8,
  '0.023*"transport" + 0.018*"would" + 0.013*"road" + 0.008*"public" + '
  '0.008*"service" + 0.007*"think" + 0.007*"take" + 0.006*"make" + '
  '0.006*"traffic" + 0.006*"information"'),
 (9,
  '0.018*"rail" + 0.014*"service" + 0.014*"railway" + 0.013*"train" + '
  '0.011*"franchise" + 0.011*"passenger" + 0.010*"would" + 0.009*"network" + '
  '0.008*"transport" + 0.007*"freight"'),
 (10,
  '0.009*"transport" + 0.008*"government" + 0.006*"service" + '
  '0.005*"crossrail" + 0.005*"make" + 0.005*"would" + 0.005*"take" + '
  '0.005*"line" + 0.005*"lord" + 0.004*"rail"'),
 (11,
  '0.018*"transport" + 0.014*"department" + 0.011*"service" + 0.011*"road" + '
  '0.011*"report" + 0.010*"year" + 0.008*"local" + 0.008*"target" + '
  '0.008*"include" + 0.008*"work"'),
 (12,
  '0.031*"transport" + 0.017*"local" + 0.012*"road" + 0.011*"plan" + '
  '0.010*"government" + 0.009*"authority" + 0.007*"scheme" + 0.007*"freight" + '
  '0.007*"traffic" + 0.006*"need"'),
 (13,
  '0.016*"government" + 0.014*"transport" + 0.012*"year" + 0.010*"would" + '
  '0.009*"make" + 0.008*"people" + 0.007*"public" + 0.007*"investment" + '
  '0.006*"service" + 0.006*"member"'),
 (14,
  '0.030*"crossrail" + 0.016*"would" + 0.013*"alternative" + 0.012*"consider" '
  '+ 0.010*"route" + 0.010*"environmental" + 0.009*"information" + '
  '0.009*"project" + 0.009*"option" + 0.008*"promoter"')]

Topic distribution across documents

In [226]:
# Topic distribution over documents.
list_topic = []
dictionary_topic = {}
for doc in texts:
    doc_bow = id2word.doc2bow(doc)        # bag-of-words representation of the document
    topic_mix = lda_model[doc_bow]        # list of (topic_id, probability) tuples
    list_topic.append(Convert(topic_mix, dictionary_topic))

# One row per document, one column per topic id.
df_topic_distribution = pd.DataFrame(list_topic)
# Attach document metadata by row index (left join keeps every document).
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
# Display without the PDF-metadata columns that are not needed here
# (drop is not assigned back — df_topic itself keeps all columns).
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)
Out[226]:
author creationDate file_name Content document_id Word_count 0 1 2 3 ... 5 6 7 8 9 10 11 12 13 14
0 Katherine A Bloomfield D:20201012160043+01'00' 10-Year_Transport_Plan_02_Jul_2002.pdf 10-Year Transport Plan \n \n6. \n \nMr. Geoffr... 0 1437 0.046091 0.025581 0.005751 0.010421 ... 0.016342 0.018955 0.016739 0.023464 0.029125 0.005569 0.055000 0.088750 0.511309 0.014800
1 D:20190730052236Z 10_year_plan_for_transport_21_May_2002.pdf House of Commons \nTransport, Local Government... 1 292641 0.116414 0.000963 0.000027 0.000154 ... 0.000489 0.039231 0.003378 0.067757 0.028585 0.000026 0.007787 0.617993 0.116872 0.000163
2 stellent D:20041008151844+01'00' A_New_deal_for_Transport_Better_for_everyone_W... AnNew deal for Transport: Better for everyone ... 2 67602 0.000597 0.000495 0.000098 0.000772 ... 0.019016 0.004444 0.009851 0.000971 0.024285 0.000094 0.018857 0.918314 0.001196 0.000385
3 Katherine A Bloomfield D:20200429153449+01'00' British_Railways_(London)_Bill_Lords_(By_Order... British Railways (London) Bill Lords (By Order... 3 26519 0.001833 0.174634 0.000393 0.004542 ... 0.004299 0.003642 0.005085 0.035104 0.068484 0.000375 0.001093 0.001104 0.428075 0.002353
4 Katherine A Bloomfield D:20201012144724+01'00' British_Railways_Bill_19_Mar_1991.pdf British Railways Bill \n \nOrder for Second Re... 4 3167 0.007888 0.047891 0.002888 0.031235 ... 0.018869 0.017723 0.032891 0.021458 0.204248 0.002736 0.008439 0.018922 0.307411 0.007159
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
224 Katherine A Bloomfield D:20201012164224+01'00' Transport__London_Underground_07_May_2008.pdf Transport: London Underground \n \nLord Bridge... 224 512 0.254691 0.034235 0.015145 0.017681 ... 0.062010 0.028310 0.026871 0.037356 0.028373 0.016293 0.030588 0.030522 0.077234 0.037115
225 Katherine A Bloomfield D:20201012164258+01'00' Transport__Rail_and_Air_Travel_06_Mar_2008.pdf Transport: Rail and Air Travel \n \nWhat is mi... 225 5173 0.006610 0.021962 0.001528 0.003640 ... 0.005300 0.046562 0.016488 0.013617 0.104275 0.001472 0.034401 0.204020 0.524514 0.004688
226 Katherine A Bloomfield D:20201012155409+01'00' Tube_Investment_debate_in_Commons_Chamber_08_D... Tube Investment \n \n[Relevant documents: Seve... 226 59658 0.021269 0.000975 0.000149 0.000874 ... 0.000752 0.002586 0.000753 0.001191 0.000849 0.000145 0.001007 0.003048 0.964035 0.000561
227 Katherine A Bloomfield D:20201012163723+01'00' Waterloo_Station_14_Mar_2007.pdf Waterloo Station \n \n11:00:00 \n \nSusan Kra... 227 4714 0.009605 0.034907 0.001724 0.005358 ... 0.012611 0.010449 0.209508 0.007894 0.287885 0.001658 0.018974 0.006365 0.373946 0.007304
228 Katherine A Bloomfield D:20201012160658+01'00' West_Coast_Main_Line_05_Mar_2002.pdf West Coast Main Line \n \n4. \n \nDr. Phyllis ... 228 691 0.088522 0.047916 0.011369 0.019174 ... 0.042224 0.028160 0.032036 0.034187 0.192715 0.010964 0.031289 0.032409 0.326357 0.030015

229 rows × 21 columns

Topic distribution of the whole corpus

As shown below, only 6 topics have a distribution share at or above the threshold (1/K); according to the PTBI proposed by Marchetti and Puranam (2020), these salient topics are the ones worth interpreting. However, we extracted 15 topics, and the salient topics alone are not enough to interpret all the documents, so we prefer to interpret all the topics.

In [54]:
# Share of each topic in the whole corpus: normalise the per-topic column
# totals so they sum to one.
col_totals = df_topic_distribution.sum()
topic_distribution = col_totals / col_totals.sum()
# Flag topics whose share reaches the 1/k threshold (k = number of topics).
topics_distribution = pd.DataFrame({'topic_id': topic_distribution.index,
                                    'topic_distribution': topic_distribution,
                                    'Not_less_than_threshold': topic_distribution >= 1/k})

topics_distribution.sort_values(by = 'topic_distribution', ascending = False)
Out[54]:
topic_id topic_distribution Not_less_than_threshold
13 13 0.287765 True
4 4 0.111648 True
9 9 0.099050 True
8 8 0.090998 True
12 12 0.077961 True
0 0 0.067110 True
5 5 0.060436 False
11 11 0.056854 False
6 6 0.043037 False
1 1 0.034460 False
3 3 0.023916 False
7 7 0.020929 False
14 14 0.020357 False
2 2 0.002752 False
10 10 0.002729 False
In [55]:
# Bar chart of corpus-level topic shares (plotly express).
fig2 = px.bar(topics_distribution, x='topic_id', y='topic_distribution',title='Topic distribution of the whole corpus')
fig2.update_layout(autosize=False, width=1000, height=300)
fig2.show()

Topic interpretation

To interpret the topics, I combined the word frequencies displayed by pyLDAvis with the prototypical documents or paragraphs suggested by the PTBI method proposed by Marchetti and Puranam (2020).

Word frequency of each topic

In [227]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis
Out[227]:

Prototypical texts for each topic

I followed the prototypical-text extraction method of the PTBI proposed by Marchetti and Puranam (2020). Its core idea is that, for a parameter L (the probability that a document belongs to a topic), a topic needs at least 1/L documents whose probability of belonging to it is >= L in order to be interpretable; the method then searches for the optimal value of L that maximizes the percentage of interpretable topics (Marchetti and Puranam, 2020, p. 20).

In [228]:
# Candidate values of 1/L: the minimum number of documents needed to interpret a topic.
List_num_doc = list(range(1, 20))
# Corresponding values of parameter L.
list_L = [1 / n for n in List_num_doc]
In [229]:
# create the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df, num_topics=None):
    """Percentage of potentially interpretable topics at threshold L = ``i``.

    A topic (a column of ``df``) counts as interpretable when at least
    1/i of its rows have probability >= i (PTBI, Marchetti & Puranam 2020).

    Parameters
    ----------
    i : float
        Parameter L, 0 < i <= 1.
    df : pd.DataFrame
        Topic-distribution frame: one column per topic, one row per
        document/paragraph.
    num_topics : int, optional
        Total topic count used as the denominator; defaults to the
        notebook-level ``k`` (kept for backward compatibility).

    Returns
    -------
    float
        Fraction of topics that are interpretable at this L. Returns 0.0
        for an empty frame (the original raised NameError in that case).
    """
    if num_topics is None:
        num_topics = k  # notebook-level topic count
    # Rows meeting the threshold, per topic column. (NaN compares False,
    # matching the original's filter + count() behaviour.)
    list_num_topics = [int((df[j] >= i).sum()) for j in df]
    # A topic is interpretable when it has at least 1/i qualifying rows.
    # Computed ONCE after the scan — the original recomputed this inside
    # the loop on every iteration.
    count1 = sum(1 for m in list_num_topics if m >= 1 / i)
    return count1 / num_topics

The following chart shows that the percentage of potentially interpretable topics for “high enough” levels of L is not large enough, so the paragraph-based interpretation can be explored.

In [230]:
# Percentage of interpretable topics at each candidate L (document-based).
list_perc1 = [perc(L, df_topic_distribution) for L in list_L]

df_L1 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc1})
fig_L1 = px.line(df_L1, x = 'Parameter L', y="Percentage of interpretable topics", title = 'Value selection for parameter L (document-based)')
fig_L1.update_layout(autosize=False, width=1200, height=400)
fig_L1.update_traces(mode = "lines + markers")
fig_L1.show()

The following chart shows that when L = 0.5, the percentage of interpretable topics is 86.7%, so we set L = 0.5 — i.e., each topic needs at least 2 (= 1/L) paragraphs to be interpretable.

In [231]:
df_topic_para
Out[231]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 0.072448 0.062922 0.056170 0.059287 0.065967 0.075801 0.064500 0.060214 0.068891 0.064993 0.055450 0.077713 0.076425 0.068982 0.070236
1 0.070747 0.070882 0.050330 0.061963 0.076724 0.062193 0.065695 0.061436 0.068397 0.062524 0.049954 0.093780 0.076701 0.064980 0.063693
2 0.086332 0.063053 0.037152 0.042598 0.099212 0.065017 0.061546 0.050136 0.055723 0.066761 0.037708 0.116469 0.081088 0.071017 0.066188
3 0.131403 0.068612 0.025374 0.029843 0.091810 0.044684 0.044783 0.040568 0.055095 0.052669 0.024974 0.061665 0.052602 0.235253 0.040666
4 0.069285 0.069053 0.044518 0.049888 0.058232 0.078048 0.061086 0.068690 0.062046 0.102345 0.043165 0.064314 0.079124 0.093914 0.056291
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52692 0.096125 0.071386 0.027858 0.041813 0.095278 0.052810 0.040474 0.051293 0.056721 0.133052 0.026874 0.047339 0.051183 0.162787 0.045006
52693 0.085474 0.068257 0.047044 0.056173 0.070794 0.066754 0.061583 0.050914 0.060732 0.076330 0.047374 0.057462 0.060376 0.128952 0.061780
52694 0.073769 0.074035 0.047510 0.057346 0.090340 0.059344 0.056209 0.057094 0.060466 0.061084 0.047341 0.054888 0.058834 0.131114 0.070625
52695 0.083192 0.059174 0.047981 0.057561 0.065598 0.068059 0.071295 0.062680 0.064459 0.084050 0.046833 0.059579 0.062758 0.117769 0.049012
52696 0.065911 0.068391 0.044016 0.043735 0.069573 0.087748 0.063180 0.086749 0.068249 0.082650 0.043095 0.060870 0.063127 0.098716 0.053991

52697 rows × 15 columns

In [232]:
#df_topic_para2 = df_topic_para1.drop(['document_id', 'paragraphs'], axis = 1)
# Percentage of interpretable topics at each candidate L (paragraph-based).
list_perc2 = [perc(L, df_topic_para) for L in list_L]

df_L2 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x = 'Parameter L', y="Percentage of interpretable topics", title = 'Value selection for parameter L (paragraph-based )')
fig_L2.update_layout(autosize=False, width=1200, height=400)
fig_L2.update_traces(mode = "lines + markers")
fig_L2.show()

Build topic model on paragraphs

In [ ]:
# tokenization
#data2 = df_para.paragraphs.values.tolist()
#data_words2_2 = list(sent_to_words(data2))
In [ ]:
# set the length of word threshold for removing the words less than the threshold
#minimum_len = 4 
#data_words2 = []
#for i in data_words2_2:
#    new_element = [x for x in i if len(x) >= minimum_len]
#    data_words2.append(new_element)
In [ ]:
# Bigram & Trigram
#bigram2 = gensim.models.Phrases(data_words2, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram2 = gensim.models.Phrases(bigram2[data_words2], threshold=100)  
#bigram_mod2 = gensim.models.phrases.Phraser(bigram2)
#trigram_mod2 = gensim.models.phrases.Phraser(trigram2)
In [ ]:
# Remove Stop Words
#data_words_nostops2 = remove_stopwords(data_words2)

# Form Trigrams
#data_words_trigrams2 = make_trigrams(data_words_nostops2)

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
In [ ]:
# Create Dictionary
#id2word2 = corpora.Dictionary(data_lemmatized2)

# Create Corpus
#texts2 = data_lemmatized2

# Term Document Frequency
#corpus2 = [id2word2.doc2bow(text) for text in texts]
In [ ]:
#lda_model2 = gensim.models.LdaModel(
#    corpus=corpus2,
#    id2word=id2word2,
#   alpha=alpha,
#    eta=eta,
#    iterations=iterations,
#    num_topics=k, 
#   passes=passes)
In [ ]:
# Compute Coherence Score
#coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemmatized2, dictionary=id2word2, coherence='c_v')
#coherence_lda2 = coherence_model_lda2.get_coherence()
#print('\nCoherence Score: ', coherence_lda2)
In [ ]:
# Visualize the topics
#vis2 = pyLDAvis.gensim_models.prepare(lda_model2, corpus2, id2word2, sort_topics = False)
#vis2
In [ ]: